/*
Creates a worker-year panel from year-state RAIS files
		// input: year-state RAIS files, microregions_municipality_concordance
		// output: rais_20102017, matleave, workeryear_panel
Three sections, each switched on/off by a toggle
	APPEND: all employment-spell observations in Brazil using RAIS 2011-2017
		Output: rais_20102017.dta (the file name says 2010, but the years read are 2011-2017)
	LOA: maternity leave information at the employment-spell level to be merged with RAIS
		Output: matleave.dta
	CLEAN: worker-year panel from RAIS 2011-2017
		Output: workeryear_panel.dta
*/

* Toggles: set to 1 to run the corresponding section below, 0 to skip it
local append 	= 1 	// append data: stacks the year-state RAIS files into one national file
local loa 		= 1  	// collapses leaves-of-absence data at the employment-spell level (to be merged with main RAIS data)
local clean 	= 1 	// creates worker-year panel and main contemporaneous variables


*---------------------------------

***---- APPEND ---

if `append' {

	* Builds the national spell-level file. For each state: stack the yearly
	* RAIS extracts (2011-2017), harmonise variable names, keep the variables
	* used by the later sections, then append the national file built so far
	* and save it back to disk.

	* Log file
	cap log close
	log using "$logs/wypanel_append", replace 

	* States are processed in this order (global kept as in the original script)
	global state "RR AC AP TO PI SE AL RO PB AM RN MA MS MT ES PA DF CE GO PE BA SC RS PR MG RJ SP"

	* Number of states already written in THIS run. The national file is only
	* appended to once the first state of the run has recreated it, so a stale
	* file from a previous run is never stacked onto, whatever the state order.
	local nstates = 0

	foreach state in $state {
		di "`state'"
		
		* First year initialises the state data set
		use "$files/rais/`state'/`state'2011_RAIS", clear
		gen year = 2011
		gen state = "`state'"

		* Remaining years: appended rows arrive with year/state unset
		forvalues year = 2012(1)2017 {
			di `year'
			append using "$files/rais/`state'/`state'`year'_RAIS"	
			replace year = `year' if year==.
			replace state = "`state'" if state==""
		}
		
		* Rename variables
		* (contracthours already has its final name, so it needs no rename)
		rename pis				fakeid_worker 
		rename firmidcnpj 		fakeid_firm 
		rename estabid			fakeid_estab
		rename dob 				birthdate
		rename education 		educ
		rename cnae20subcl 		cnaesubclass20
		rename pat 				estabpat
		rename simples 			indsimples
		rename concla 			juridnature
		rename hiredate 		admdate
		rename hiretype 		admtype
		rename emptype 			contracttype
		rename wagetype 		contractsaltype
		rename contractwage 	contractsal
		rename remunavgminw 	earningsavgmw
		rename remunavgnom 		earningsavgnom
		rename remundecminw 	earningsdecmw
		rename remundecnom 		earningsdecnom
		rename employed 		emp1231
		rename tenure 			empmonths
		rename sepcause 		sepreason
		rename cpf				workerid_cpf
		rename firmtype			estabid_type
		rename leave1endday		LOA_endday1
		rename leave1endmonth	LOA_endmonth1
		rename leave1startday	LOA_startday1
		rename leave1startmonth	LOA_startmonth1
		rename leave2endday		LOA_endday2
		rename leave2endmonth	LOA_endmonth2
		rename leave2startday	LOA_startday2
		rename leave2startmonth	LOA_startmonth2
		rename leave3endday		LOA_endday3
		rename leave3endmonth	LOA_endmonth3
		rename leave3startday	LOA_startday3
		rename leave3startmonth	LOA_startmonth3
		rename leavecause1		LOA_cause1
		rename leavecause2		LOA_cause2
		rename leavecause3		LOA_cause3
		rename leavetotaldays	LOA_days
		rename hiremonth 		admmonth
		
		* Keep only the variables needed to run the rest. Done before the
		* append so the full-width state data is not held in memory together
		* with the national file (the result is the same set of variables).
		keep fakeid_worker fakeid_firm fakeid_estab birthdate gender race educ disability state ///
			contracttype contractsal contractsaltype contracthours earningsavgmw earningsdecmw ///
			emp1231 year sepday sepmonth sepreason age LOA_* earningsavgnom earningsdecnom ///
			empmonths cbo02 cnae20class admmonth municipality estabid_type cnaesubclass20 juridnature
		
		* Append across states and save
		if `nstates' > 0 {
			append using "$files/rais/BR/rais_20102017.dta"
		}
		compress
		save "$files/rais/BR/rais_20102017.dta", replace
		local ++nstates
			
	}
	
	cap log close
	
}


***---- LOA ---

if `loa' {

	* Log file
	cap log close
	log using "$logs/wypanel_loa", replace
	
	* Load appended data
	use "$files/rais/BR/rais_20102017.dta", clear
	// only keep spells where at least one of the three recorded leaves of
	// absence (LOA) has cause code 50 (maternity)
	keep if LOA_cause1==50 | LOA_cause2==50 | LOA_cause3==50

	/********************************************************************
		Clean start and end dates for LOA events
		(generate a duration variable)
	*******************************************************************/

	* Year as text, used to assemble MM/DD/YYYY date strings
	tostring year, gen(yearstr)
			
	forvalues i = 1/3 {

		* Day and month parts become two-character strings, left-padded with
		* "0". The padding length is computed inline, so no scratch length
		* variables are left behind in the data.
		foreach part in startday startmonth endmonth endday {
			tostring LOA_`part'`i', replace
			replace LOA_`part'`i' = "0"*(2-strlen(LOA_`part'`i')) + LOA_`part'`i'
		}

		* Date strings are only built when the cause is neither -1 nor 99;
		* otherwise they stay empty and the dates/duration below are missing
		gen startdate`i'=LOA_startmonth`i'+"/"+LOA_startday`i'+"/"+yearstr if LOA_cause`i'!=-1 & LOA_cause`i'!=99 
		gen enddate`i'=LOA_endmonth`i'+"/"+LOA_endday`i'+"/"+yearstr if LOA_cause`i'!=-1 & LOA_cause`i'!=99
		
		* Numeric dates and within-year duration (end date minus start date)
		gen start`i'=date(startdate`i', "MDY")
		gen end`i'=date(enddate`i', "MDY")
		gen duration`i'=end`i'-start`i'
		
	}


	/********************************************************************
		Drop duplicates 
		(unique worker-establishment-year observations)
	******************************************************************/

	* Number of surplus rows sharing each worker-establishment-year key
	* (0 when the key is unique). Any key that occurs more than once, i.e. a
	* worker with several maternity LOA rows in one establishment-year, is
	* discarded entirely: none of its rows is kept.
	bysort fakeid_worker fakeid_estab year: gen uniqueworkerestyear = _N - 1
	tab uniqueworkerestyear
	drop if uniqueworkerestyear>0 
		

	/********************************************************************
		Add leaves taken in consecutive years
	********************************************************************/

	sort fakeid_worker fakeid_estab year
	
	* for how many is the start date on jan 1?
	gen jan1 = (substr(startdate1,1,5)=="01/01")|(substr(startdate2,1,5)=="01/01")|(substr(startdate3,1,5)=="01/01")
	tab jan1
	drop jan1

	* setup for adding durations if start on jan 1, and end on dec 31 of prev year
	* (flags are defined only for maternity leaves, cause 50)
	forvalues i = 1/3 {
		gen startjan1`i'=(LOA_startday`i'=="01" & LOA_startmonth`i'=="01") if LOA_cause`i'==50
		gen enddec31`i'=(LOA_endday`i'=="31" & LOA_endmonth`i'=="12") if LOA_cause`i'==50
	}
	//Missing are more likely as we move from LOA1 to LOA3
	//but conditional on this, it's also much less likely to have a leave starting in Jan 1st as we move from LOA1 to LOA3
	sum startjan1*

	* For each leave slot i of the current row (LOA cause 1, 2, 3 in turn):
	*   adddays`i'`j'   = 1 if slot i ends on Dec 31 and slot j of the worker's
	*                     NEXT row starts on Jan 1 of the FOLLOWING calendar year
	*   durationadded`i' = duration of slot i plus the duration of the absorbed
	*                     slot of the next row
	*   removeobs`i'    = 1 on the row whose leave was absorbed by the previous row
	*
	* Two changes relative to the earlier version of this section:
	*  - the next row must belong to year+1; without that check a leave ending
	*    on Dec 31 could be joined to a Jan 1 leave several years later
	*  - the within-worker row order (establishment, then year) is stated in
	*    every bysort instead of being inherited from the sort above
	* Rows are still grouped by worker only, so the next row may belong to a
	* different establishment, as before.
	*
	* NOTE(review): each duration is end minus start, so the joined duration
	*   omits the Dec 31 -> Jan 1 day; confirm whether +1 is wanted given the
	*   ">120 days" cut-off used later.
	* NOTE(review): removeobs`i' blanks durationadded`i' on the absorbed row even
	*   when the absorbed leave sat in another slot j of that row; confirm intent.

	* Extra condition for slots 2 and 3: the next row must not already have
	* been absorbed by an earlier slot. Grows by one clause per slot.
	local notmerged ""

	forvalues i = 1/3 {

		* end on dec 31, next row starts on jan 1 of the following year
		forvalues j = 1/3 {
			bysort fakeid_worker (fakeid_estab year): gen adddays`i'`j'=1 ///
				if enddec31`i'==1 & startjan1`j'[_n+1]==1 & year[_n+1]==year+1 `notmerged'
		}
		gen durationadded`i'=duration`i'

		* add durations (prioritize LOA1 over LOA2 over LOA3 from t+1):
		* slot 1 of the next row is applied last, so it wins
		forvalues j = 3(-1)1 {
			bysort fakeid_worker (fakeid_estab year): replace durationadded`i'=duration`i' + duration`j'[_n+1] ///
				if adddays`i'`j'==1
		}

		* remove observation if added to the previous year
		bysort fakeid_worker (fakeid_estab year): gen removeobs`i'=1 ///
			if (adddays`i'1[_n-1]==1)|(adddays`i'2[_n-1]==1)|(adddays`i'3[_n-1]==1)
		replace durationadded`i'=. if removeobs`i'==1
		replace durationadded`i'=. if LOA_cause`i'!=50

		local notmerged "`notmerged' & removeobs`i'[_n+1]!=1"
	}
	
	forvalues i = 1/3 {
		sum durationadded`i', d
	}
		
			
	/***************************************************************************
		Create the maternity dummies
	 ***************************************************************************/

	* Per-slot outcomes, for leave slot i = 1 (first), 2 (second), 3 (third):
	*   matmore120`i'    : 1 if durationadded`i' exceeds 120, defined only when
	*                      slot i is maternity (cause 50) and the duration is known
	*   durationadded`i' : FINAL duration (after joining consecutive years)
	local i = 0
	foreach ord in first second third {
		local ++i
		gen matmore120`i'=(durationadded`i'>120) if LOA_cause`i'==50 & durationadded`i'!=.
		label variable durationadded`i' "FINAL: duration of `ord' leave if maternity"
		label variable matmore120`i' "FINAL: more than 120 days if `ord' maternity leave"
	}

	egen anymatmore120=rowmax(matmore1201 matmore1202 matmore1203)
	label variable anymatmore120 "FINAL: any of the 3 possible maternity leaves more than 120 days"

	egen maxdurationadded=rowmax(durationadded1 durationadded2 durationadded3)
	label variable maxdurationadded "FINAL: duration of longest maternity leave taken"

	* RAW within-year durations are blanked for non-maternity slots
	local i = 0
	foreach ord in first second third {
		local ++i
		replace duration`i'=. if LOA_cause`i'!=50
		label variable duration`i' "RAW: duration of `ord' leave if maternity"
	}

	egen maxduration=rowmax(duration1 duration2 duration3)
	label variable maxduration "RAW: duration of longest maternity leave taken in the year"
	
	egen sumduration=rowtotal(duration1 duration2 duration3)
	label variable sumduration "RAW: total maternity leave taken in the year"
	
	* 1 if emp1231 equals 1 for this spell
	gen keepjob = (emp1231==1)
	
	* Save maternity leave dataset (to be merged)
	* sumduration is listed explicitly: none of the wildcard patterns matches
	* it, so it used to be computed and labelled but never saved
	keep fakeid_worker fakeid_estab year duration* matmore* LOA_cause* max* any* keepjob sumduration
	mdesc
	drop if maxdurationadded==.
	compress
	save "$files/rais/BR/matleave.dta", replace 
		
	cap log close	

}


***---- CLEAN ---
	
if `clean' {

	* Log file
	cap log close
	log using "$logs/wypanel_clean", replace 
	
	* Start from the appended spell-level file
	use "$files/rais/BR/rais_20102017.dta", clear


	//Dummies for injury related leave (10/30 and 10-40), 0/1 for every spell:
	//  injury_10_30 : any of the three LOA causes is coded 10 or 30
	//  injury_all   : any of the three LOA causes is coded 10, 20, 30 or 40
	gen injury_10_30 = inlist(LOA_cause1, 10, 30) | inlist(LOA_cause2, 10, 30) | inlist(LOA_cause3, 10, 30)
	
	gen injury_all = injury_10_30==1 | inlist(LOA_cause1, 20, 40) | inlist(LOA_cause2, 20, 40) | inlist(LOA_cause3, 20, 40)
	
	/* Keep main employment spell per worker */

	* Spell length in months within the year: separation month (0 is read as
	* 12) minus admission month (missing is read as 0). Non-positive results
	* are set to 1, and a result that is still missing is set to 12.
	replace admmonth=0 if admmonth==.
	gen duration = cond(sepmonth==0, 12, sepmonth) - admmonth
	replace duration=1 if duration<=0
	replace duration=12 if duration==.
		
	* Months of employment in the year, summed over all of the worker's spells
	gegen durationtot=sum(duration), by(fakeid_worker year)		
	label variable durationtot "Total months of employment in year"
	
	* Earnings: the spell-level monthly average is kept as earningsavgnom1,
	* while earningsavgnom is rebuilt as yearly total earnings over yearly
	* total months, irrespective of employer
	gen earningstotnom=earningsavgnom*duration
	rename earningsavgnom earningsavgnom1
	label variable earningsavgnom1 "Average monthly earnings"
	gegen earningstot=sum(earningstotnom), by(fakeid_worker year)
	label variable earningstot "Total earnings for the year"
	gen earningsavgnom=earningstot/durationtot
	label variable earningsavgnom "Average monthly earnings"
	
	* Only the spell(s) with the longest duration in the worker-year survive
	gegen maxdur=max(duration), by(fakeid_worker year)
	keep if duration==maxdur
	
	* Highest spell-level wage among the spells still in memory
	* NOTE(review): computed after the longest-duration filter above, so
	*   shorter but better-paid spells no longer count; confirm this is intended
	gegen maxwage=max(earningsavgnom1), by(fakeid_worker year)
	
	* Several rows at the same establishment: one is enough, since the employer
	* identity is the same
	duplicates drop fakeid_worker fakeid_estab year, force

	* How many worker-years still have more than one employer?
	duplicates tag fakeid_worker year, gen(tag2)
	tab tag2 
	
	* Reduce to one row per worker-year. No random draw is made here:
	* duplicates drop retains the first occurrence in the current data order.
	duplicates drop fakeid_worker year, force
	
	
	/* Merge with LOA (maternity) variables: worker-establishment-year obs */
	* Rows found only in the maternity file are discarded; any_matleave marks
	* the worker-years that found a match there
	cap drop _merge
	merge 1:1 fakeid_worker fakeid_estab year using "$files/rais/BR/matleave.dta"
	keep if _merge != 2
	gen any_matleave = _merge == 3
	drop _merge


	*** Worker -level variables ***
	*******************************
	
	* Gender: 1 when gender equals 2, left missing when gender is missing
	gen female = (gender == 2) if gender!=. 
	label variable female "Female"
	
	* Childbearing age: age in [20, 35], left missing when age is missing;
	* cbage_female is 1 only when both flags equal 1
	gen cbage = (inrange(age, 20, 35)) if age!=. 
	label variable cbage "Between 20 and 35 yo"
	gen cbage_female = (cbage == 1 & female == 1)

	* Log wages are taken before the level variables get their short names
	gen lnwage=ln(earningsavgnom) 
	gen lndecwage=ln(earningsdecnom)

	* Short names: wages, tenure, occupation
	rename (earningsavgnom earningsdecnom empmonths cbo02) (wage decwage ten occ)
	
	* Manager: occupation code, as text, starts with 12, 13 or 14
	tostring occ, gen(cbostr)
	gen cbo2=substr(cbostr, 1, 2)
	gen manager=inlist(cbo2, "12", "13", "14")
	
	* Supervisor: 1 when the first three characters of the occupation code
	* (cbostr) match one of the codes listed below, 0 otherwise
	gen occ3=substr(cbostr, 1, 3)
	gen byte supervisor=0
	foreach code in 111 113 114 121 122 123 131 141 142 410 420 510 520 620 630 710 720 730 740 750 760 770 780 810 820 830 840 860 950 {
		replace supervisor=1 if occ3=="`code'"
	}
	drop occ3
	
	* Hire from another establishment: 1 when the establishment differs from
	* the one in the previous panel year (which must be non-missing);
	* left missing in 2011, the first year read by the APPEND section
	xtset fakeid_worker year
	gen newhire_EE = (fakeid_estab!=l.fakeid_estab & l.fakeid_estab!=.) if year != 2011

	* At baseline info (baseline year is 2014)
	cap drop temp 
	
	* bl_employer: the worker's establishment in 2014, spread to all her years
	gen double temp = fakeid_estab if year == 2014
	gegen double bl_employer = max(temp), by(fakeid_worker)
	drop temp 
	gen atblemp=(fakeid_estab==bl_employer)
		
	* atblemp_cbage: atblemp, defined only for workers with cbage==1 in 2014
	gen temp = cbage if year == 2014
	gegen cbage_bl = max(temp) , by(fakeid_worker)
	gen atblemp_cbage = atblemp if cbage_bl == 1
	drop temp cbage_bl
	
						
	* Likelihood of not being observed at firm after maternity leave
	// dummy defined only after woman takes first maternity leave (so missing before and missing for men and women not taking maternity)
	// it is equal to 1 if at same estab as the one in which take first mat leave
	cap drop temp

	* yearmat: first year in which a female worker has a matched maternity leave
	gen temp = year if any_matleave == 1 & female ==1
	gegen yearmat = min(temp), by(fakeid_worker)
	drop temp
	
	* estabmat: establishment in that year. Stored explicitly as double, like
	* bl_employer above, so the establishment id is held exactly and the
	* equality test below does not depend on the default storage type.
	gen double temp = fakeid_estab if year == yearmat
	gegen double estabmat = max(temp), by(fakeid_worker)
	drop temp
	
	* From yearmat onwards: 1 if at estabmat, 0 otherwise; missing before
	gen atmatestab = (fakeid_estab==estabmat) if yearmat!=. & year>= yearmat
	
	label var atmatestab "1 if still at the estab where took first maternity leave"
	
	label var atmatestab "1 if still at the estab where took first maternity leave"

	
	
	*** Establishment - level variables ***
	***************************************		
	
	* micro-region: looked up by municipality; concordance rows with no
	* worker attached are discarded
	cap drop _merge
	merge m:1 municipality using "$raw/microregions_municipality_concordance.dta", keepusing(microregion)
	keep if _merge != 2
	drop _merge
	
	* modal establishment attributes: most frequent value of each source
	* variable within the establishment (minmode picks the smallest on ties)
	local modetargets ind_mode municipality_mode estabidtype_mode juridnature_mode cnaesubcl_mode
	local modesources cnae20class municipality estabid_type juridnature cnaesubclass20
	forvalues k = 1/5 {
		local newvar : word `k' of `modetargets'
		local srcvar : word `k' of `modesources'
		gegen `newvar' = mode(`srcvar'), by(fakeid_estab) minmode
	}

	* establishment-year head counts: all workers, women, men, workers aged
	* 20-35 and women aged 20-35 (each is the total of a 0/1 variable)
	cap drop temp
	gen temp = 1
	gen byte male = 1-female
	local counttargets emptot empfem empmal emptot_cbage empfem_cbage
	local countsources temp female male cbage cbage_female
	forvalues k = 1/5 {
		local newvar : word `k' of `counttargets'
		local srcvar : word `k' of `countsources'
		gegen `newvar' = total(`srcvar'), by(fakeid_estab year)
	}
	
	* indicator for it employs both men and women 
	gen byte both_FM = (empfem!=0 & empmal!=0 & empfem!=. & empmal!=.) 

	label variable emptot "Total employment at establishment"
	label variable empfem "Total female employment at establishment"
	label variable empmal "Total male employment at establishment"
	label variable emptot_cbage "Total workers bw 20 and 35 yo"
	label variable empfem_cbage "Total female workers bw 20 and 35 yo"
	label variable both_FM "Employs both men and women"

	* average wage	(total and by gender), by establishment-year
	* The gender filter names the variable in full (female). The shorthand
	* "fem" only resolved through variable-name abbreviation, which stops
	* working under "set varabbrev off" or once another fem* variable exists.
	* NOTE(review): the gender-specific means are filled in only on rows of
	*   that gender (missing on the other gender's rows); confirm that is the
	*   intended layout for an establishment-level variable.
	gegen avgwage_est = mean(wage), by(fakeid_estab year)
	gegen avgwagefem_est = mean(wage) if female==1, by(fakeid_estab year)
	gegen avgwagemal_est = mean(wage) if female==0, by(fakeid_estab year)
	
	/* Save */
	* Variables of the final worker-year panel, in their final column order.
	* Listed once and used for both keep and order.
	local panelvars ///
		fakeid_worker fakeid_firm fakeid_estab birthdate gender race educ disability occ cbostr ///
		contracttype contractsal contractsaltype contracthours earningsavgmw earningsavgnom1 ///
		earningsdecmw decwage emp1231 year ten sepday sepmonth sepreason age cnae20class ///
		duration earningstotnom durationtot earningstot wage maxdur maxwage anymatmore120 ///
		maxdurationadded maxduration keepjob any_matleave female cbage cbage_female lnwage ///
		lndecwage cbo2 manager supervisor newhire_EE yearmat estabmat atmatestab microregion ///
		ind_mode municipality_* estabidtype_mode juridnature_mode cnaesubcl_mode emptot empfem ///
		empmal emptot_cbage empfem_cbage both_FM avgwage_est avgwagefem_est avgwagemal_est ///
		bl_employer atblemp atblemp_cbage injury*

	sort fakeid_worker  year		
	keep `panelvars'
	order `panelvars'
	mdesc
	compress
	save "$files/rais/BR/workeryear_panel.dta", replace

	
	cap log close		
	
}	


